Github Repository - https://github.com/Aerishgaba/Data-Analysis

##Loading the dataset

# Read IMDB_Action.csv into a data frame.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this script on another machine.
IMDB_Action <- read.csv(
  file = "/Users/aerishgaba/Desktop/Projects/IMDB_Action.csv"
)

1. Structure of the Dataset

# Show a compact overview of the imported data: dimensions, column types and
# the first few values of each column.
str(object = IMDB_Action)
## 'data.frame':    7800 obs. of  11 variables:
##  $ Title               : chr  "Anora" "I'm Still Here" "Flow" "Once Upon a Time... in Hollywood" ...
##  $ Title_URL           : chr  "https://www.imdb.com/title/tt28607951/?ref_=sr_t_1" "https://www.imdb.com/title/tt14961016/?ref_=sr_t_2" "https://www.imdb.com/title/tt4772188/?ref_=sr_t_3" "https://www.imdb.com/title/tt7131622/?ref_=sr_t_4" ...
##  $ Image               : chr  "https://m.media-amazon.com/images/M/MV5BYThiN2M0NTItODRmNC00NDhlLWFiYTgtMWM2YTEyYzI3ZTY1XkEyXkFqcGc@._V1_QL75_U"| __truncated__ "https://m.media-amazon.com/images/M/MV5BM2FjMjBiZjgtZDkyYy00YTRlLTk5N2QtODE2ZWIyYWE0Yzg0XkEyXkFqcGc@._V1_QL75_U"| __truncated__ "https://m.media-amazon.com/images/M/MV5BOTM5ODBlOTAtYjcwZi00YzkzLWIzODEtMTM2MTZlNDFmMWU2XkEyXkFqcGc@._V1_QL75_U"| __truncated__ "https://m.media-amazon.com/images/M/MV5BMzMzNmViNjYtN2ViNi00NDM3LWFlMmItNDYyMGIzY2EzZjE2XkEyXkFqcGc@._V1_QL75_U"| __truncated__ ...
##  $ ipclockupoverlay_URL: chr  "https://www.imdb.com/title/tt28607951/?ref_=sr_i_1" "https://www.imdb.com/title/tt14961016/?ref_=sr_i_2" "https://www.imdb.com/title/tt4772188/?ref_=sr_i_3" "https://www.imdb.com/title/tt7131622/?ref_=sr_i_4" ...
##  $ Release.Year        : chr  "2024" "2024" "2024" "2019" ...
##  $ Duration            : chr  "2h 19m" "2h 17m" "1h 25m" "2h 41m" ...
##  $ Rated               : chr  "R" "PG-13" "PG" "R" ...
##  $ Rating              : num  7.6 8.5 7.9 7.6 8.5 8.2 8.7 9.3 8.5 7.7 ...
##  $ Votes               : int  140 90 54 897 610 149 2 3 1 143 ...
##  $ Popularity          : int  91 85 87 84 79 85 74 82 97 94 ...
##  $ Description         : chr  "A young escort from Brooklyn meets and impulsively marries the son of a Russian oligarch. Once the news reaches"| __truncated__ "A mother is forced to reinvent herself when her family's life is shattered by an act of arbitrary violence duri"| __truncated__ "Cat is a solitary animal, but as its home is devastated by a great flood, he finds refuge on a boat populated b"| __truncated__ "As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt"| __truncated__ ...
# Shorter working name for the imported data frame.
df <- IMDB_Action

2. Variables of the dataset

# Collect the column names of the data frame and print them.
variables <- colnames(df)
print(variables)
##  [1] "Title"                "Title_URL"            "Image"               
##  [4] "ipclockupoverlay_URL" "Release.Year"         "Duration"            
##  [7] "Rated"                "Rating"               "Votes"               
## [10] "Popularity"           "Description"

3. First 15 rows of the dataset

# Keep the first 15 rows of the data frame and display them.
top_15 <- head(x = df, n = 15)
print(top_15)

4. User-defined function to calculate the missing values

#' Count missing values per column
#'
#' @param df A data frame.
#' @return A data frame with one row per column of `df` and three columns:
#'   `variable` (the column name), `missing_values` (number of `NA` entries in
#'   that column) and `percentage` (that count divided by `nrow(df)`, times
#'   100). A `df` with zero columns yields a zero-row result.
missing_values <- function(df) {
  n_rows <- nrow(df)
  # Iterating over the columns with vapply() avoids growing the result one row
  # at a time and, unlike 1:ncol(df), is well-behaved when df has no columns.
  na_counts <- vapply(
    df,
    function(column) sum(is.na(column)),
    numeric(1),
    USE.NAMES = FALSE
  )
  data.frame(
    variable = names(df),
    missing_values = na_counts,
    percentage = (na_counts / n_rows) * 100
  )
}

# Report the NA count and NA percentage of every column in the dataset.
print(missing_values(df))

5. Filter rows with Votes more than 100

# Keep only the rows whose Votes value exceeds 100.
# which() drops rows where Votes is NA; indexing with the raw logical vector
# would turn each of those rows into an all-NA row in the result.
df_filtered <- df[which(df$Votes > 100), ]
df_filtered

6. Independent and Dependent Variables

# Pair Title (used here as the independent variable) with Rating (used as the
# dependent variable) in a two-column data frame and display it.
independent_var <- df[["Title"]]
dependent_var <- df[["Rating"]]

new_df <- data.frame(
  independent_var = independent_var,
  dependent_var = dependent_var
)
print(new_df)

7. Remove missing values from the dataset except Popularity

# Drop every row that has an NA in any column other than Popularity.
# The columns are selected by name: the previous positional index 9 only
# checked Votes, which did not match the stated goal and would silently pick a
# different column if the column order ever changed.
cols_to_check <- setdiff(names(df), "Popularity")
df_no_na <- df[complete.cases(df[, cols_to_check, drop = FALSE]), ]
df_no_na

8. Identify and remove duplicate rows

# Discard rows that exactly repeat an earlier row (the first occurrence is
# kept), then display the result.
df_no_duplicates <- unique(df_no_na)
print(df_no_duplicates)

9. Reorder by column ‘Rating’ in descending order

# Sort the rows from the highest to the lowest Rating and display them.
rating_rank <- order(-df_no_duplicates[["Rating"]])
df_ordered <- df_no_duplicates[rating_rank, ]
print(df_ordered)

10. Rename ‘Release Year’ column to ‘Released’

# Rename the Release.Year column to Released.
# Done with base R so this step does not rely on a package being attached:
# rename() is not a base function and no library() call is visible here.
# The stopifnot() keeps the step failing loudly if the column is absent.
stopifnot("Release.Year" %in% names(df_ordered))
df_renamed <- df_ordered
names(df_renamed)[names(df_renamed) == "Release.Year"] <- "Released"
df_renamed

11. Add a new variable to the data frame by multiplying Votes by 2

# Copy the renamed data and append Votes_x_2, which holds twice the Votes
# value of each row; then display the result.
df_new_var <- df_renamed
df_new_var[["Votes_x_2"]] <- df_renamed[["Votes"]] * 2
print(df_new_var)

12. Create Training Set

Split data (80% training)

# Draw a random sample of row indices (sample size: 0.8 times the row count)
# and use it to build the training set. The fixed seed makes the draw
# repeatable.
set.seed(123)
train_size <- 0.8 * nrow(df_new_var)
train_indices <- sample(seq_len(nrow(df_new_var)), size = train_size)
train_set <- df_new_var[train_indices, ]
print(train_set)

13. Summary of the dataset

# Per-column summary of the cleaned data frame.
print(summary(object = df_new_var))
##     Title            Title_URL            Image           ipclockupoverlay_URL
##  Length:7338        Length:7338        Length:7338        Length:7338         
##  Class :character   Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character    
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##    Released           Duration            Rated               Rating     
##  Length:7338        Length:7338        Length:7338        Min.   :1.300  
##  Class :character   Class :character   Class :character   1st Qu.:5.300  
##  Mode  :character   Mode  :character   Mode  :character   Median :6.200  
##                                                           Mean   :6.125  
##                                                           3rd Qu.:7.200  
##                                                           Max.   :9.800  
##                                                                          
##      Votes         Popularity     Description          Votes_x_2     
##  Min.   :  1.0   Min.   :  3.00   Length:7338        Min.   :   2.0  
##  1st Qu.:  4.0   1st Qu.: 42.00   Class :character   1st Qu.:   8.0  
##  Median : 21.0   Median : 57.00   Mode  :character   Median :  42.0  
##  Mean   :127.6   Mean   : 57.65                      Mean   : 255.3  
##  3rd Qu.:129.0   3rd Qu.: 73.00                      3rd Qu.: 258.0  
##  Max.   :997.0   Max.   :100.00                      Max.   :1994.0  
##                  NA's   :3936

14. Mean, Mode and Median of Rating column

# Mean, mode and median of the Rating column.
# The mode is read off a frequency table sorted by decreasing count: the name
# of its first entry is the most frequent rating value.
rating_counts <- sort(table(df_new_var$Rating), decreasing = TRUE)
mean_rating <- mean(df_new_var$Rating, na.rm = TRUE)
mode_rating <- as.numeric(names(rating_counts)[1])
median_rating <- median(df_new_var$Rating, na.rm = TRUE)

print(mean_rating)
## [1] 6.124516
print(mode_rating)
## [1] 6.1
print(median_rating)
## [1] 6.2

15. Scatterplot for Rating and Released column

# Scatterplot of Released against Rating.
# Released is still a character column at this point in the script, so
# plotting it directly would give a discrete axis with one tick per distinct
# string. Work on a plotting copy: convert Released to numeric and drop the
# rows whose value cannot be parsed (as.numeric() warns about those).
scatter_data <- df_new_var
scatter_data$Released <- as.numeric(as.character(scatter_data$Released))
scatter_data <- scatter_data[!is.na(scatter_data$Released), ]
ggplot(scatter_data, aes(x = Rating, y = Released)) +
  geom_point() +
  labs(title = "Scatterplot for Rating and Released column")

16. Bar plot for Rating column

# Bar plot with one bar per distinct Rating value.
# The title now describes what is drawn: the plot neither bins Rating nor
# uses the Rated column, contrary to what the previous title stated.
ggplot(df_new_var, aes(x = Rating)) +
  geom_bar() +
  labs(title = "Bar plot for Rating column")

# Bar plot with one dark-red bar per category of the Rated column.
ggplot(data = df_new_var, mapping = aes(x = Rated)) +
  geom_bar(fill = "darkred") +
  labs(title = "Bar plot for Rated column")

Converting Released column to numeric

# Convert the Released column to numeric, then summarise the data again.
# Values that cannot be parsed as a number become NA.
df_new_var[["Released"]] <- as.numeric(as.character(df_new_var[["Released"]]))
print(summary(df_new_var))
##     Title            Title_URL            Image           ipclockupoverlay_URL
##  Length:7338        Length:7338        Length:7338        Length:7338         
##  Class :character   Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character    
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##     Released      Duration            Rated               Rating     
##  Min.   :1915   Length:7338        Length:7338        Min.   :1.300  
##  1st Qu.:1991   Class :character   Class :character   1st Qu.:5.300  
##  Median :2009   Mode  :character   Mode  :character   Median :6.200  
##  Mean   :2003                                         Mean   :6.125  
##  3rd Qu.:2018                                         3rd Qu.:7.200  
##  Max.   :2025                                         Max.   :9.800  
##  NA's   :1                                                           
##      Votes         Popularity     Description          Votes_x_2     
##  Min.   :  1.0   Min.   :  3.00   Length:7338        Min.   :   2.0  
##  1st Qu.:  4.0   1st Qu.: 42.00   Class :character   1st Qu.:   8.0  
##  Median : 21.0   Median : 57.00   Mode  :character   Median :  42.0  
##  Mean   :127.6   Mean   : 57.65                      Mean   : 255.3  
##  3rd Qu.:129.0   3rd Qu.: 73.00                      3rd Qu.: 258.0  
##  Max.   :997.0   Max.   :100.00                      Max.   :1994.0  
##                  NA's   :3936
# Drop the rows whose Released value is NA, then summarise the data again.
df_new_var <- df_new_var[!is.na(df_new_var$Released), ]
print(summary(df_new_var))
##     Title            Title_URL            Image           ipclockupoverlay_URL
##  Length:7337        Length:7337        Length:7337        Length:7337         
##  Class :character   Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character    
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##     Released      Duration            Rated               Rating     
##  Min.   :1915   Length:7337        Length:7337        Min.   :1.300  
##  1st Qu.:1991   Class :character   Class :character   1st Qu.:5.300  
##  Median :2009   Mode  :character   Mode  :character   Median :6.200  
##  Mean   :2003                                         Mean   :6.124  
##  3rd Qu.:2018                                         3rd Qu.:7.200  
##  Max.   :2025                                         Max.   :9.800  
##                                                                      
##      Votes         Popularity     Description          Votes_x_2     
##  Min.   :  1.0   Min.   :  3.00   Length:7337        Min.   :   2.0  
##  1st Qu.:  4.0   1st Qu.: 42.00   Class :character   1st Qu.:   8.0  
##  Median : 21.0   Median : 57.00   Mode  :character   Median :  42.0  
##  Mean   :127.6   Mean   : 57.65                      Mean   : 255.3  
##  3rd Qu.:129.0   3rd Qu.: 73.00                      3rd Qu.: 258.0  
##  Max.   :997.0   Max.   :100.00                      Max.   :1994.0  
##                  NA's   :3935

17. Pearson correlation between Released and Rating

# Pearson correlation coefficient between the Released and Rating columns.
print(
  cor(
    x = df_new_var[["Released"]],
    y = df_new_var[["Rating"]],
    method = "pearson"
  )
)
## [1] -0.1972266